title: 'RGMQL Example R Notebook: Use case 2' author: "Silvia Cascianelli" date: "r Sys.Date()" output: pdf_document: default html_notebook: default html_document: default BiocStyle::html_document: chunk_output_type: inline


In this example we investigate the TCGA messenger and micro RNA gene expression data of patients affected by Adrenocortical carcinoma, showing how to to merge and process local data (even possibly private) with remote public data in mixed processing, performing analyses both remotely and on the local R/Bioconductor environment.

Load the RGMQL package and initialize the remote GMQL context of scalable data management engine, specifying remote_processing = TRUE, and, possibly, an authenticated login:

library(RGMQL)
remote_url <- "http://www.gmql.eu/gmql-rest"
init_gmql( url = remote_url, remote_processing = TRUE) # , username = 'XXXX', password = 'XXXX')

Read the TCGA miRNA gene expression data of Adrenocortical carcinoma aligned to GRCh38 from a local GDM dataset:

GRCh38_miRNA_path <- 'GRCh38_miRNA_ACC'
GRCh38_miRNA_ACC <- read_gmql(GRCh38_miRNA_path, is_local = TRUE)

Download and extract the list of datasets in the curated remote repository and focus on those concerning TCGA:

remote_processing(TRUE)
dataset_list <- show_datasets_list(remote_url)
list <- unlist(lapply(dataset_list[["datasets"]], function(x) x$name))
grep(pattern = 'TCGA', x = list, value = TRUE)

Choose the latest TCGA gene expression dataset of interest, aligned to GRCh38, read it and extract all the samples of patients affected by Adrenocortical carcinoma (ACC) :

GRCh38_TCGA_RNAseq <- read_gmql(dataset = "public.GRCh38_TCGA_gene_expression_2019_10", is_local = FALSE)

GRCh38_TCGA_RNAseq_ACC <- filter(GRCh38_TCGA_RNAseq, gdc__project__project_id == "TCGA-ACC")

Join the two datasets based on the biospecimen_aliquot__bcr_aliquot_barcode and keeping for each miRNA region the first gene region at minimum distance ((MD(1)):

ACC_mRNA_miRNA <- merge(GRCh38_miRNA_ACC,
                  GRCh38_TCGA_RNAseq_ACC,    
                  genometric_predicate = list(MD(1)),
                  region_output = "BOTH",
                  joinBy = conds
                  ('biospecimen__bio__bcr_analyte_barcode'))

Launch the remote processing execution to materialize the resulting ordered dataset:

collect(ACC_mRNA_miRNA, name = "ACC_mRNA_miRNA")
job<-execute()

Monitor the job status:

trace_job(remote_url, job$id)

Once the job status is 'SUCCESS' download the resulting dataset obtained remotely in the working directory of the local File System and turn the remote processing off:

name_dataset <- job$datasets[[1]]$name
download_dataset(remote_url, name_dataset, path = './Results_use_case_2')

Import mRNA and microRNA gene annotations together with their corresponding raw count values from the dataset just saved in the local File System to the current R environment, within a GRanges object:

# Uncomment to load previously obtained results, specifying the path until the downloaded dataset:
#name_dataset <- '_20220105_090918_ACC_mRNA_miRNA' 
remote_processing(FALSE)
path <- paste('./Results_use_case_2', name_dataset, sep = '/')
GR_ACC <- filter_and_extract(path, metadata = NULL, region_attributes = c('right.gene_symbol','right.htseq_count','left.mirna_id','left.read_count'))

Show all metadata to extract sample IDs and clinical annotations:

setwd('./Results_use_case_2')
meta_table <- show_all_metadata(name_dataset, show_value = TRUE)
sample_IDs<-unlist(meta_table['left.biospecimen__shared__bcr_patient_barcode',])
status<- unlist(meta_table["left.clinical__clin_shared__person_neoplasm_cancer_status",])
stages<- unlist(meta_table['left.gdc__diagnoses__tumor_stage',])
grades<- unlist(meta_table['left.gdc__diagnoses__tumor_grade',])

Log out from remote engine:

logout_gmql(remote_url)

Extract different samples as rows, considering as features an ordered joint list of raw counts for each mRNA and miRNA gene under analysis

all_genes <- c(GR_ACC@elementMetadata@listData[['right.gene_symbol...1']],GR_ACC@elementMetadata@listData[['left.mirna_id...3']])
all_genes_ord <- unique(sort(all_genes))

columns <- length(all_genes_ord)
rawMatrix <- matrix(nrow = 0, ncol = columns)

for (i in seq(1, length(GR_ACC@elementMetadata@listData), 4)) {
  g <- paste('right.gene_symbol...', toString(i), sep = '')
  m <- paste('left.mirna_id...', toString(i+2), sep = '')
  gr <- paste('right.htseq_count...', toString(i+1), sep = '')
  mr <- paste('left.read_count...', toString(i+3), sep = '')
  all_genes <- c(GR_ACC@elementMetadata@listData[[g]]
               ,GR_ACC@elementMetadata@listData[[m]])
  all_reads <- as.numeric(c(GR_ACC@elementMetadata@listData[[gr]],                      GR_ACC@elementMetadata@listData[[mr]]))
  names(all_reads) <- all_genes
  all_reads_ord <- all_reads[c(all_genes_ord)]
  rawMatrix <- rbind(rawMatrix, all_reads_ord)
}

rawData <- data.frame(rawMatrix)
rownames(rawData) <- sample_IDs

Discard all the mRNA or miRNA genes having a null first quartile value across the samples and normalize each sample by centering and dividing by standard deviation:

LQ <- apply(rawMatrix, 2, function(x) summary(x)[2])
kept <- which(LQ > 0)
dataset <- rawData[,kept]

library(BBmisc)
dataset_n <- normalize(dataset, 'standardize')

Use the remaining genes as features of each sample to perform a clustering analysis and identify subgroups of samples through hierarchical clustering. First, identify the optimal number of clusters based on the average silhouette score:

library(factoextra)

#Choice of the number of clusters
res <- fviz_nbclust(dataset_n, hcut , method = "silhouette", k.max = 10) 
plot(res)

Apply Ward Hierarchical Clustering using Spearman correlation as similarity metric; then cut the dendogram to highlight three sample clusters:

# 
d <- get_dist(x = dataset_n, method = "spearman") # distance matrix
hclust_model <- hclust(d, method = "ward.D")
plot(hclust_model) # display dendogram
clusters <- cutree(hclust_model, k = 3) # cut tree into clusters
# draw dendogram with coloured borders around the clusters
rect.hclust(hclust_model, k = 3, border = 2:5)

Plot the so-obtained clusters:

fviz_cluster(list(data = dataset_n, cluster = clusters),labelsize = 6)

Plot the number of samples in each cluster:

hist(clusters, breaks = c(0.5,1.5,2.5,3.5), labels = c('first cluster', 'second cluster', 'third cluster'), xaxt = 'n', ylab = 'Number of patients', ylim=c(0,55))
axis(side = 1, at=seq(1:3))

Now compare by a left join the clusters obtained with this clustering analysis and the clustering results published by the TCGA consortium in their Comprehensive Pan-Genomic Characterization of Adrenocortical Carcinoma:

clustering_results <- data.frame('sample' = sample_IDs , 'cluster' = clusters)
rownames(clustering_results)<- NULL

library(readr)
 CLUSTERS_TCGA_ACC <- read_delim("Results_use_case_2/CLUSTERS_TCGA_ACC.csv", ";", escape_double = FALSE, trim_ws = TRUE)

comparison<-merge(x = clustering_results, y =  CLUSTERS_TCGA_ACC,  by = 'sample', all.x = TRUE)

library('xlsx')
write.xlsx(comparison, file='Results_use_case_2/Comparison_clusterings.xlsx', col.names = TRUE, row.names = FALSE, append = FALSE)


for(i in 3:dim(comparison)[2]){
  comparison_table <- table(comparison$cluster, comparison[,i])
  subt<-paste('comparison with patient clusters based on', colnames(comparison)[i], collapse=' ')

  mosaicplot(comparison_table, color = TRUE, main = "Mosaic plot of two alternative clustering results",
           sub = subt,
           xlab = "Hierachical Clustering results on mRNA and miRNA genes:",
           ylab = colnames(comparison)[i],
           las = 1)
}

Similarly we compare the clusters obtained with this clustering analysis and the available TCGA survival annotations:

clustering_results <- data.frame('sample' = sample_IDs , 'cluster' = clusters)
rownames(clustering_results)<- NULL

library(readr)
SURVIVAL_TCGA_ACC <- read_delim("Results_use_case_2/SURVIVAL_TCGA_ACC.csv", ";", escape_double = FALSE, trim_ws = TRUE)

comparison_s<-merge(x = clustering_results, y =  SURVIVAL_TCGA_ACC,  by = 'sample', all.x = TRUE)

library('xlsx')
#uncomment to create an excel file from comparison_s
# write.xlsx(comparison_s, file='Results_use_case_2/Comparison_clustering_survival.xlsx', col.names = TRUE, row.names = FALSE, append = FALSE)

  comparison_table <- table(comparison_s$cluster, comparison_s$OS_STATUS)
  subt<-'comparison with overall survival status annotations'

  mosaicplot(comparison_table, color = TRUE, main = 'Mosaic plot of two alternative patient stratifications',
           sub = subt,
           xlab = 'Hierachical Clustering results on mRNA and miRNA genes:',
           ylab = 'Overall survival status',
           las = 1)

    comparison_table2 <- table(comparison_s$cluster, comparison_s$DFS_STATUS)
  subt<-'comparison with disease-free survival status annotations'

  mosaicplot(comparison_table2, color = TRUE, main = 'Mosaic plot of two alternative patient stratifications',
           sub = subt,
           xlab = 'Hierachical Clustering results on mRNA and miRNA genes:',
           ylab = 'Disease-free survival status',
           las = 1)

Eventually,

Eventually, evaluate how clinical annotations, like status and stages, are distributed within the clusters obtained with this clustering analysis:

clinical <- list(status, stages)#, grades are all not reported
par(cex.lab=0.55, cex.axis=1, cex.main=1)


for(c in clinical){
  comparison_table <- table(comparison$cluster, c)
  subt<-paste('Patient Hierachical Clustering on mRNA and miRNA genes compared with clusters based on clinical', substring(names(c)[1], first = 14), collapse=' ')


  mosaicplot(comparison_table, color = TRUE, main = "Mosaic plot of clustering results versus clinical stratification",
           xlab = subt,
           ylab = substring(names(c)[1], first = 14),
           las = 1)
}


DEIB-GECO/RGMQL documentation built on Feb. 17, 2024, 10:39 p.m.